Phyloseq R package

Phyloseq tutorial

Dataset

rm(list=ls(all=TRUE))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(microbiome)
## Loading required package: phyloseq
## 
## microbiome R package (microbiome.github.com)
##     
## 
## 
##  Copyright (C) 2011-2022 Leo Lahti, 
##     Sudarshan Shetty et al. <microbiome.github.io>
## 
## 
## Attaching package: 'microbiome'
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## 
## The following object is masked from 'package:base':
## 
##     transform
library(ggh4x)

Read relative abundance dataset:

physeq <- read_rds("Data/phyloseq.rds")

physeq
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 197 taxa and 55 samples ]
## sample_data() Sample Data:       [ 55 samples by 26 sample variables ]
## tax_table()   Taxonomy Table:    [ 197 taxa by 8 taxonomic ranks ]

Extract taxon table:

tax_table <- physeq %>% tax_table() %>% as.data.frame() %>% rownames_to_column("OTU") %>% as_tibble()

tax_table

Sample attributes:

sample_attributes <- physeq %>% sample_data() %>% data.frame() %>% rownames_to_column("Sample") %>% colnames()

sample_attributes
##  [1] "Sample"                      "ATTRIBUTE_SampleType"       
##  [3] "ATTRIBUTE_SampleTypeSub1"    "NCBITaxonomy"               
##  [5] "YearOfAnalysis"              "SubjectIdentifierAsRecorded"
##  [7] "AgeInYears"                  "ATTRIBUTE_BiologicalSex"    
##  [9] "UBERONBodyPartName"          "TermsofPosition"            
## [11] "HealthStatus"                "DOIDCommonName"             
## [13] "ComorbidityListDOIDIndex"    "SampleCollectionDateandTime"
## [15] "Country"                     "HumanPopulationDensity"     
## [17] "LatitudeandLongitude"        "DepthorAltitudeMeters"      
## [19] "qiita_sample_name"           "UniqueSubjectID"            
## [21] "LifeStage"                   "UBERONOntologyIndex"        
## [23] "DOIDOntologyIndex"           "ATTRIBUTE_Ethnic_Group"     
## [25] "ATTRIBUTE_Study_Group"       "Sebumeter_Score"            
## [27] "Skicon_Score"

Experimental groups:

physeq %>% sample_data() %>% as_tibble() %>% count(ATTRIBUTE_Study_Group)

Relative family abundances

Convert Phyloseq object to long format:

data_long <- physeq %>% 
  psmelt() %>% as_tibble()

data_long

What are the most abundant families across all samples?

family_abundance <- data_long %>% 
  group_by(Family) %>% 
  summarize(Abundance = sum(Abundance)/n_distinct(Sample)) %>% 
  arrange(-Abundance)

family_abundance

Select the largest families for downstream analysis:

major_families <- family_abundance$Family[1:6]

major_families
## [1] "Propionibacteriaceae" "Staphylococcaceae"    "Polyomaviridae"      
## [4] "Malasseziaceae"       "Corynebacteriaceae"   "Micrococcaceae"

Family level

Add additional column Major_Families to the long dataset as well as to the Phyloseq object:

data_long <- data_long %>% 
  mutate(
    Major_Families = if_else(
      Family %in% major_families,
      Family, "Other families"
    ),
    Major_Families = factor(Major_Families, levels = c(major_families, "Other families"))
  )

tax_table(physeq) <- tax_table(physeq) %>% 
  as.data.frame() %>% 
  mutate(
    Major_Families = if_else(
      Family %in% major_families,
      Family, "Other families"
    )
  ) %>% 
  as.matrix()

Plot relative abundance of the major families:

data_long %>%
  inner_join(
    data_long %>% 
      group_by(ATTRIBUTE_Study_Group, Sample) %>% 
      summarize(
        Propionibacteriaceae = sum(Abundance[Family == "Propionibacteriaceae"]),
        .groups = "drop_last"
        ) %>% 
      arrange(-Propionibacteriaceae) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Major_Families)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "bottom",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

  • Most volunteers from both study groups are dominated by either Propionibacteriaceae or Staphylococcaceae. Three volunteers have a major proportion of Polyomaviridae, a few more have a minor proportion.
  • Although the range of relative proportions of Propionibacteriaceae and Staphylococcaceae is similar in both study groups, there are more volunteers with a high proportion of Propionibacteriaceae in the Oily Scalp group.

Label samples with more than 50% Propionibacteriaceae in the Phyloseq object:

sample_data(physeq) <- sample_data(physeq) %>% 
  data.frame() %>% 
  rownames_to_column("Sample") %>% 
  inner_join(
    data_long %>% 
      group_by(Sample) %>% 
      summarize(
        Propionibacteriaceae = sum(Abundance[Family == "Propionibacteriaceae"]),
        Polyomaviridae       = sum(Abundance[Family == "Polyomaviridae"])
        ) %>% 
      mutate(
        Propionibacteriaceae = Propionibacteriaceae > 0.5,
        Polyomaviridae       = Polyomaviridae > 0.1
      ),
    by = "Sample"
  ) %>% 
  column_to_rownames("Sample")

Look at Propionibacteriaceae:

data_long <- data_long %>% 
  mutate(
    Propionibacteria = if_else(
      Family == "Propionibacteriaceae" & is.na(Species), 
      Genus, Species
    )
  )

Propionibacteria <- data_long %>% 
  filter(Family == "Propionibacteriaceae") %>% 
  group_by(Propionibacteria) %>% 
  summarize(Abundance = sum(Abundance)/n_distinct(Sample)) %>% 
  arrange(-Abundance) %>% 
  pull(Propionibacteria)

data_long %>%
  inner_join(
    data_long %>% 
      group_by(ATTRIBUTE_Study_Group, Sample) %>% 
      summarize(
        Propionibacteriaceae = sum(Abundance[Family == "Propionibacteriaceae"]),
        .groups = "drop_last"
        ) %>% 
      arrange(-Propionibacteriaceae) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  filter(Family == "Propionibacteriaceae") %>% 
  mutate(
    `Relative abundance` = Abundance
    ) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Propionibacteria)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "bottom",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1", limits = Propionibacteria)

Propionibacteriaceae

Look at strains of Propionibacteriaceae:

data_long <- data_long %>% 
 mutate(
   Propionibacterium_varieties = if_else(
     Family == "Propionibacteriaceae" & is.na(Variety), 
     Propionibacteria, Variety
   )
 )

Propionibacterium_varieties <- data_long %>% 
  filter(Family == "Propionibacteriaceae") %>% 
  group_by(Propionibacterium_varieties) %>% 
  summarize(Abundance = sum(Abundance)/n_distinct(Sample)) %>% 
  arrange(-Abundance) %>% 
  pull(Propionibacterium_varieties)

data_long %>%
  inner_join(
    data_long %>% 
      group_by(ATTRIBUTE_Study_Group, Sample) %>% 
      summarize(
        Propionibacteriaceae = sum(Abundance[Family == "Propionibacteriaceae"]),
        .groups = "drop_last"
        ) %>% 
      arrange(-Propionibacteriaceae) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  filter(Family == "Propionibacteriaceae") %>% 
  mutate(
    `Relative abundance` = Abundance
    ) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Propionibacterium_varieties)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1", limits = Propionibacterium_varieties)

  • The null hypothesis cannot be rejected that Propionibacterium_acnes_unclassified, Propionibacteriaceae_unclassified, and GCF_000221145 are in fact the same strain.

  • All taxa from this family should be summarized as Propionibacteriaceae, with almost all reads being Propionibacterium_acnes

Staphylococcaceae

Look at Staphylococcaceae:

Staphylococci <- data_long %>% 
  filter(Family == "Staphylococcaceae") %>% 
  group_by(Species) %>% 
  summarize(Abundance = sum(Abundance)/n_distinct(Sample)) %>% 
  arrange(-Abundance) %>% 
  pull(Species) %>% 
  .[1:3]

data_long %>%
  inner_join(
    data_long %>% 
      group_by(ATTRIBUTE_Study_Group, Sample) %>% 
      summarize(
        Staphylococcaceae = sum(Abundance[Family == "Staphylococcaceae"]),
        .groups = "drop_last"
        ) %>% 
      arrange(-Staphylococcaceae) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  filter(Family == "Staphylococcaceae") %>% 
  mutate(
    `Relative abundance` = Abundance,
    Staphylococci = if_else(Species %in% Staphylococci, Species, "Other Staphylococci")
    ) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Staphylococci)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(
    palette = "Set1",
    limits = Staphylococci %>% c("Other Staphylococci")
    )

Which taxa should be summarized into Staphylococcus_caprae_capitis?

taxa_staph_caprae_capitis <- tax_table %>% 
  filter(Species == "Staphylococcus_caprae_capitis") %>% 
  pull(OTU)

tax_table %>% 
  filter(OTU %in% taxa_staph_caprae_capitis)

Which taxa should be summarized into Staphylococcus_epidermidis?

taxa_staph_epidermidis <- tax_table %>% 
  filter(Species == "Staphylococcus_epidermidis") %>% 
  pull(OTU)

tax_table %>% 
  filter(OTU %in% taxa_staph_epidermidis)

Which taxa should be summarized into Staphylococcus_hominis?

taxa_staph_hominis <- tax_table %>% 
  filter(Species == "Staphylococcus_hominis") %>% 
  pull(OTU)

tax_table %>% 
  filter(OTU %in% taxa_staph_hominis)

Which taxa should be summarized as other Staphylococci?

taxa_staph_other <- tax_table %>% 
  filter(
    Family == "Staphylococcaceae" & 
      !OTU %in% c(taxa_staph_caprae_capitis, taxa_staph_epidermidis, taxa_staph_hominis)
    ) %>% 
  pull(OTU)

tax_table %>% 
  filter(OTU %in% taxa_staph_other)

Polyomaviridae

Look at Polyomaviridae:

Polyomaviri <- data_long %>% 
  filter(Family == "Polyomaviridae") %>% 
  group_by(Species) %>% 
  summarize(Abundance = sum(Abundance)/n_distinct(Sample)) %>% 
  arrange(-Abundance) %>% 
  pull(Species)

data_long %>%
  inner_join(
    data_long %>% 
      group_by(ATTRIBUTE_Study_Group, Sample) %>% 
      summarize(
        Polyomaviridae = sum(Abundance[Family == "Polyomaviridae"]),
        .groups = "drop_last"
        ) %>% 
      arrange(-Polyomaviridae) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  filter(Family == "Polyomaviridae") %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Species)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1", limits = Polyomaviri)

Which taxa should be summarized into Polyomavirus_HPyV6?

taxa_poly_hpyv6 <- tax_table %>% 
  filter(Species == "Polyomavirus_HPyV6") %>% 
  pull(OTU)

tax_table %>% 
  filter(OTU %in% taxa_poly_hpyv6)

Which taxa should be summarized into Polyomavirus_HPyV7

taxa_poly_hpyv7 <- tax_table %>% 
  filter(Species == "Polyomavirus_HPyV7") %>% 
  pull(OTU)

tax_table %>% 
  filter(OTU %in% taxa_poly_hpyv7)

Which taxa should be summarized into Merkel_cell_polyomavirus?

taxa_poly_merkel <- tax_table %>% 
  filter(Species == "Merkel_cell_polyomavirus") %>% 
  pull(OTU)

tax_table %>% 
  filter(OTU %in% taxa_poly_merkel)

Skin paramters

Plot relative abundance of the major families, sorted by sebumeter score:

data_long %>%
  inner_join(
    data_long %>% 
      group_by(ATTRIBUTE_Study_Group, Sample) %>% 
      summarize(
        Sebumeter_Score = Sebumeter_Score[1],
        .groups = "drop_last"
        ) %>% 
      arrange(Sebumeter_Score) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Major_Families)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "bottom",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

data_long %>% 
  group_by(ATTRIBUTE_Study_Group, Sample) %>% 
  summarize(
    Sebumeter_Score = Sebumeter_Score[1],
    .groups = "drop_last"
    ) %>% 
  arrange(Sebumeter_Score) %>% 
  mutate(
    Sample_No = row_number(),
    `Sebumeter score` = Sebumeter_Score
  ) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Sebumeter score`, fill = "")
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "none",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

Plot relative abundance of the major families, sorted by Skicon score:

data_long %>%
  inner_join(
    data_long %>% 
      group_by(Sample) %>% 
      summarize(
        Skicon_Score = Skicon_Score[1],
        .groups = "drop_last"
        ) %>% 
      arrange(Skicon_Score) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(Sample, Sample_No),
    by = "Sample"
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(Sample_No, `Relative abundance`, fill = Major_Families)
    ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

data_long %>% 
  group_by(Sample) %>% 
  summarize(
    Skicon_Score = Skicon_Score[1],
    .groups = "drop_last"
    ) %>% 
  arrange(Skicon_Score) %>% 
  mutate(
    Sample_No = row_number(),
    `Skicon score` = Skicon_Score
  ) %>% 
  ggplot() +
  geom_col(
    aes(Sample_No, `Skicon score`, fill = "")
    ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "none",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

Ordination analysis

Define function for axis labels:

get_axis_labels <- function(x) str_c(
  "PC",
  1:length(x$values$Relative_eig),
  " [",
  (100 * x$values$Relative_eig) %>% round(1),
  "%]"
  )

Bray-Curtis distance

Calculate ordination:

ordination <- physeq %>% 
  ordinate("PCoA", "bray")

Plot samples:

axis_labels <- get_axis_labels(ordination)

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "Dominat_Propioni"
  ) +
  scale_color_brewer(palette = "Set1", limits = c(T, F)) +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )
## Warning in plot_ordination(physeq, ordination, type = "samples", color =
## "Dominat_Propioni"): Color variable was not found in the available data you
## provided.No color mapped.

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "ATTRIBUTE_Study_Group"
  ) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "ATTRIBUTE_Ethnic_Group"
  ) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "ATTRIBUTE_BiologicalSex"
  ) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "Sebumeter_Score"
  ) +
  scale_color_viridis_c() +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "Skicon_Score"
  ) +
  scale_color_viridis_c() +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "AgeInYears"
  ) +
  scale_color_viridis_c() +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "Propionibacteriaceae"
  ) +
  scale_color_brewer(palette = "Set1", limits = c(T, F)) +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )

plot_ordination(
  physeq,
  ordination,
  type = "samples",
  color = "Polyomaviridae"
  ) +
  scale_color_brewer(palette = "Set1", limits = c(T, F)) +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2]
    )

  • There is no real separation into groups with oily or normal scalp.
  • Samples with high sebumeter or skicon score are in the upper half of the plot, but samples with low sebumeter or skicon score are all around.

Plot families:

plot_ordination(
  physeq,
  ordination,
  type = "taxa",
  color = "Major_Families"
  ) +
  scale_color_brewer(palette = "Set1", limits = c(major_families, "Other families"), name = NULL) +
  labs(
    title = "PCoA, Bray-Curtis dissimilarity",
    x = axis_labels[1], y = axis_labels[2],
    )

Datasets for MMVec

Reasonable taxon resolution

Aggregate dataset by major taxa. Check whether the sum of the abundances within each sample is 1 and whether no taxa were lost:

data_major_taxa_long <- data_long %>% 
  mutate(
    Major_Taxa = case_when(
      OTU %in% taxa_poly_hpyv6  ~ "Polyomavirus HPyV6",
      OTU %in% taxa_poly_hpyv7  ~ "Polyomavirus HPyV7",
      OTU %in% taxa_poly_merkel ~ "Merkel Cell Polyomavirus",
      OTU %in% taxa_staph_caprae_capitis  ~ "Staphylococcus caprae or capitis",
      OTU %in% taxa_staph_epidermidis     ~ "Staphylococcus epidermidis",
      OTU %in% taxa_staph_hominis         ~ "Staphylococcus hominis",
      OTU %in% taxa_staph_other           ~ "Other Staphylococci",
      T ~ Major_Families
    ),
    Major_Taxa = Major_Taxa %>% factor(levels = c("Propionibacteriaceae", "Staphylococcus caprae or capitis", "Staphylococcus epidermidis", "Staphylococcus hominis", "Other Staphylococci", "Polyomavirus HPyV6", "Polyomavirus HPyV7", "Merkel Cell Polyomavirus", "Malasseziaceae", "Corynebacteriaceae", "Micrococcaceae", "Other families"))
  ) %>% 
  group_by(across(c(sample_attributes, Major_Families, Major_Taxa))) %>% 
  summarize(
    N_Taxa = n(),
    Abundance = sum(Abundance), 
    .groups = "drop"
    )
## Warning: There was 1 warning in `group_by()`.
## ℹ In argument: `across(c(sample_attributes, Major_Families, Major_Taxa))`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(sample_attributes)
## 
##   # Now:
##   data %>% select(all_of(sample_attributes))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
data_major_taxa_long %>% 
  group_by(Sample) %>% 
  summarize(Abundance = sum(Abundance), .groups = "drop")
data_major_taxa_long %>% 
  group_by(Major_Taxa) %>% 
  summarize(N_Taxa = N_Taxa[1]) %>% 
  summarize(N_Taxa = sum(N_Taxa))

Transform to wide format and save to file:

data_major_taxa_wide <- data_major_taxa_long %>% 
  pivot_wider(id_cols = Major_Taxa, names_from = Sample, values_from = Abundance)

data_major_taxa_wide %>% write_tsv("Data/data_major_taxa_wide.tsv")

data_major_taxa_wide

Plot relative abundances:

data_major_taxa_long %>%
  inner_join(
    data_major_taxa_long %>% 
      filter(Major_Taxa == "Propionibacteriaceae") %>% 
      arrange(-Abundance) %>% 
      group_by(ATTRIBUTE_Study_Group) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Major_Taxa)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set3")

Plot relative abundances, sorted by sebumeter score:

data_major_taxa_long %>%
  inner_join(
    data_major_taxa_long %>% 
      filter(Major_Taxa == "Propionibacteriaceae") %>% 
      arrange(Sebumeter_Score) %>% 
      group_by(ATTRIBUTE_Study_Group) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Major_Taxa)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set3")

data_major_taxa_long %>% 
  group_by(ATTRIBUTE_Study_Group, Sample) %>% 
  summarize(
    Sebumeter_Score = Sebumeter_Score[1],
    .groups = "drop_last"
    ) %>% 
  arrange(Sebumeter_Score) %>% 
  mutate(
    Sample_No = row_number(),
    `Sebumeter score` = Sebumeter_Score
  ) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Sebumeter score`, fill = "")
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "none",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

Plot relative abundances, sorted by sebumeter score:

data_major_taxa_long %>%
  inner_join(
    data_major_taxa_long %>% 
      filter(Major_Taxa == "Propionibacteriaceae") %>% 
      arrange(Skicon_Score) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(Sample_No, `Relative abundance`, fill = Major_Taxa)
    ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set3")

data_major_taxa_long %>% 
  group_by(Sample) %>% 
  summarize(
    Skicon_Score = Skicon_Score[1],
    .groups = "drop_last"
    ) %>% 
  arrange(Skicon_Score) %>% 
  mutate(
    Sample_No = row_number(),
    `Skicon score` = Skicon_Score
  ) %>% 
  ggplot() +
  geom_col(
    aes(Sample_No, `Skicon score`, fill = "")
    ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "none",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

Propionibacteriaceae and Staphylococcaceae,

Filter dataset for Propionibacteriaceae and Staphylococcaceae and re-scale the abundances within each sample to 1:

data_propi_staph_long <- data_major_taxa_long %>% 
  filter(Major_Families %in% c("Propionibacteriaceae", "Staphylococcaceae")) %>% 
  group_by(Sample) %>% 
  mutate(Abundance = Abundance / sum(Abundance)) %>% 
  ungroup()

data_propi_staph_long %>% 
  group_by(Sample) %>% 
  summarize(Abundance = sum(Abundance), .groups = "drop")
data_propi_staph_long %>% 
  group_by(Major_Taxa) %>% 
  summarize(N_Taxa = N_Taxa[1]) %>% 
  summarize(N_Taxa = sum(N_Taxa))

Transform to wide format and save to file:

data_propi_staph_wide <- data_propi_staph_long %>% 
  pivot_wider(id_cols = Major_Taxa, names_from = Sample, values_from = Abundance)

data_propi_staph_wide %>% write_tsv("Data/data_propi_staph_wide.tsv")

data_propi_staph_wide

Plot relative abundances:

data_propi_staph_long %>%
  inner_join(
    data_propi_staph_long %>% 
      filter(Major_Taxa == "Propionibacteriaceae") %>% 
      arrange(-Abundance) %>% 
      group_by(ATTRIBUTE_Study_Group) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Major_Taxa)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set3")

Plot relative abundances, sorted by sebumeter score:

data_propi_staph_long %>%
  inner_join(
    data_propi_staph_long %>% 
      filter(Major_Taxa == "Propionibacteriaceae") %>% 
      arrange(Sebumeter_Score) %>% 
      group_by(ATTRIBUTE_Study_Group) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Relative abundance`, fill = Major_Taxa)
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set3")

data_propi_staph_long %>% 
  group_by(ATTRIBUTE_Study_Group, Sample) %>% 
  summarize(
    Sebumeter_Score = Sebumeter_Score[1],
    .groups = "drop_last"
    ) %>% 
  arrange(Sebumeter_Score) %>% 
  mutate(
    Sample_No = row_number(),
    `Sebumeter score` = Sebumeter_Score
  ) %>% 
  ggplot() +
  geom_col(
    aes(interaction(Sample_No, ATTRIBUTE_Study_Group), `Sebumeter score`, fill = "")
    ) +
  scale_x_discrete(guide = guide_axis_nested(n.dodge = 0, title = "Study group")) +
  theme(
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, vjust = 0.5, hjust=0),
    legend.position = "none",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

Plot relative abundances, sorted by sebumeter score:

data_propi_staph_long %>%
  inner_join(
    data_propi_staph_long %>% 
      filter(Major_Taxa == "Propionibacteriaceae") %>% 
      arrange(Skicon_Score) %>% 
      mutate(
        Sample_No = row_number()
      ) %>% 
      select(ATTRIBUTE_Study_Group, Sample, Sample_No),
    by = c("ATTRIBUTE_Study_Group", "Sample")
  ) %>% 
  mutate(`Relative abundance` = Abundance) %>% 
  ggplot() +
  geom_col(
    aes(Sample_No, `Relative abundance`, fill = Major_Taxa)
    ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "right",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set3")

data_propi_staph_long %>% 
  group_by(Sample) %>% 
  summarize(
    Skicon_Score = Skicon_Score[1],
    .groups = "drop_last"
    ) %>% 
  arrange(Skicon_Score) %>% 
  mutate(
    Sample_No = row_number(),
    `Skicon score` = Skicon_Score
  ) %>% 
  ggplot() +
  geom_col(
    aes(Sample_No, `Skicon score`, fill = "")
    ) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "none",
    legend.title = element_blank()
    ) +
  scale_fill_brewer(palette = "Set1")

Session Info

timestamp()
## ##------ Sun Jul 23 17:28:37 2023 ------##
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggh4x_0.2.5       microbiome_1.22.0 phyloseq_1.44.0   lubridate_1.9.2  
##  [5] forcats_1.0.0     stringr_1.5.0     dplyr_1.1.2       purrr_1.0.1      
##  [9] readr_2.1.4       tidyr_1.3.0       tibble_3.2.1      ggplot2_3.4.2    
## [13] tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] ade4_1.7-22             tidyselect_1.2.0        viridisLite_0.4.2      
##  [4] farver_2.1.1            Biostrings_2.68.1       bitops_1.0-7           
##  [7] fastmap_1.1.1           RCurl_1.98-1.12         digest_0.6.33          
## [10] timechange_0.2.0        lifecycle_1.0.3         cluster_2.1.4          
## [13] survival_3.5-5          magrittr_2.0.3          compiler_4.3.0         
## [16] rlang_1.1.1             sass_0.4.7              tools_4.3.0            
## [19] igraph_1.5.0            utf8_1.2.3              yaml_2.3.7             
## [22] data.table_1.14.8       knitr_1.43              labeling_0.4.2         
## [25] bit_4.0.5               RColorBrewer_1.1-3      plyr_1.8.8             
## [28] Rtsne_0.16              withr_2.5.0             BiocGenerics_0.46.0    
## [31] grid_4.3.0              stats4_4.3.0            fansi_1.0.4            
## [34] multtest_2.56.0         biomformat_1.28.0       colorspace_2.1-0       
## [37] Rhdf5lib_1.22.0         scales_1.2.1            iterators_1.0.14       
## [40] MASS_7.3-60             cli_3.6.1               rmarkdown_2.23         
## [43] vegan_2.6-4             crayon_1.5.2            generics_0.1.3         
## [46] rstudioapi_0.15.0.9000  reshape2_1.4.4          tzdb_0.4.0             
## [49] ape_5.7-1               cachem_1.0.8            rhdf5_2.44.0           
## [52] zlibbioc_1.46.0         splines_4.3.0           parallel_4.3.0         
## [55] XVector_0.40.0          vctrs_0.6.3             Matrix_1.6-0           
## [58] jsonlite_1.8.7          IRanges_2.34.1          hms_1.1.3              
## [61] S4Vectors_0.38.1        bit64_4.0.5             foreach_1.5.2          
## [64] jquerylib_0.1.4         glue_1.6.2              codetools_0.2-19       
## [67] stringi_1.7.12          gtable_0.3.3            GenomeInfoDb_1.36.1    
## [70] munsell_0.5.0           pillar_1.9.0            htmltools_0.5.5        
## [73] rhdf5filters_1.12.1     GenomeInfoDbData_1.2.10 R6_2.5.1               
## [76] vroom_1.6.3             evaluate_0.21           lattice_0.21-8         
## [79] Biobase_2.60.0          highr_0.10              bslib_0.5.0            
## [82] Rcpp_1.0.11             nlme_3.1-162            permute_0.9-7          
## [85] mgcv_1.9-0              xfun_0.39               pkgconfig_2.0.3